Normal Table HET SNP Frequency Analysis

###load targeted normal read counts
targeted_normal_reads <- read.table("../Gaudin_thesis_data/97_sample_st.dat", stringsAsFactors = F, header = TRUE, sep=",")

##format targeted normal reads tables -- [[1]]: VAF frequency [[2]] coverage at SNP site
targeted_normal_read_table <- format_het_table(targeted_normal_reads,25,75)

##indices of normal samples run on NovaSeq 6000
nova_indices <- readRDS("../Gaudin_thesis_data/nova_indices.rds")

##only include NovaSeq normal samples 
targeted_normal_read_table[[1]] <- targeted_normal_read_table[[1]][,c(c(1,2),c(nova_indices + 2))] ##nova_indices + 2 accounts for chromosome and position columns
targeted_normal_read_table[[2]] <- targeted_normal_read_table[[2]][,c(c(1,2),c(nova_indices + 2))]

#Figure 1. Mean VAF for heterozygous SNP sites in normal pool.

st_alt_table <- targeted_normal_read_table[[1]][,3:ncol(targeted_normal_read_table[[1]])] ## remove chromosome and position
st_cov_table <- targeted_normal_read_table[[2]][,3:ncol(targeted_normal_read_table[[2]])] ## remove chromosome and position
avg_alts <- c() 
avg_covs <- c()
for(i in 1:nrow(st_alt_table)){
  indices <- which(st_alt_table[i,] > 25 & st_alt_table[i,] < 75 & st_cov_table[i,] > 100) ##only include SNP sites with VAF frequency between 25 - 75 and coverage > 100, 
  if(length(indices) > 7){ ##only average SNPs that are heterozygous at at least 7 SNP sites
    avg_alts <- c(avg_alts, mean(as.numeric(st_alt_table[i,indices])))
    avg_covs <- c(avg_covs, mean(as.numeric(st_cov_table[i,indices]))) 
  }
}
cov_df <- data.frame(Average_vaf = avg_alts, Average_coverage = avg_covs)
cov_df <- cov_df[!is.na(cov_df$Average_vaf),]
print("number of SNPs included")
## [1] "number of SNPs included"
print(nrow(cov_df))## print number of SNPs 
## [1] 551
print(ggplot(cov_df, aes(x=Average_coverage, y=Average_vaf)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed"))

gene_table <- read.table("GENES_chrom_start_stop.txt", header=TRUE)

##adjust for targeting of genes for +/- 1MB , but dont overlap MSH2 and MSH6
gene_table$Start[-c(4)] <- gene_table$Start[-c(4)] - 1000000
gene_table$Stop[-c(3)] <- gene_table$Stop[-c(3)] + 1000000
print(gene_table)
##     GENE Chromosome     Start      Stop
## 1  BRCA2         13  31889611  33973809
## 2  BRCA1         17  40196312  42277500
## 3   MSH2          2  46630108  47789450
## 4   MSH6          2  47922669  49037240
## 5   MLH1          3  36034823  38107380
## 6   PMS2          7   5012870   7048756
## 7   CDK4         12  57141510  59149796
## 8   MDM2         12  68201956  70239214
## 9  ERBB2         17  36844167  38886697
## 10  EGFR          7  54086714  56324313
## 11   MET          7 115312444 117438440
## 12   MYC          8 127747680 129753680

Figure 9. Histograms showing distributions of 20 targeted SNPs within EGFR (left) and BRCA1 (right) genes in normal samples (n = 47).

normal_targeted_het_counts <- gene_het_count_table(gene_table,targeted_normal_read_table,25,75,100)
for(i in 1:ncol(normal_targeted_het_counts)){
  print(hist(normal_targeted_het_counts[,i], breaks = max(normal_targeted_het_counts[,i]), main=paste("Heterozygous SNPs for Each Sample (n = 47) Targeting", colnames(normal_targeted_het_counts)[i]), xlab="Number of Heterozygous SNPs", ylab="Number of Samples"))
}

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
## 
## $counts
##  [1] 7 2 2 0 0 4 3 2 1 3 4 2 1 4 4 1 7
## 
## $density
##  [1] 0.14893617 0.04255319 0.04255319 0.00000000 0.00000000 0.08510638
##  [7] 0.06382979 0.04255319 0.02127660 0.06382979 0.08510638 0.04255319
## [13] 0.02127660 0.08510638 0.08510638 0.02127660 0.14893617
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## $counts
##  [1] 19  2  2  2  3  0  0  0  0  0  0  0  0  0  1  0  1  0  1 16
## 
## $density
##  [1] 0.40425532 0.04255319 0.04255319 0.04255319 0.06382979 0.00000000
##  [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## [13] 0.00000000 0.00000000 0.02127660 0.00000000 0.02127660 0.00000000
## [19] 0.02127660 0.34042553
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5 19.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
## 
## $counts
##  [1] 19  2  1  0  0  0  1  1  1  0  0  1  1  1  2  2  1  2 12
## 
## $density
##  [1] 0.40425532 0.04255319 0.02127660 0.00000000 0.00000000 0.00000000
##  [7] 0.02127660 0.02127660 0.02127660 0.00000000 0.00000000 0.02127660
## [13] 0.02127660 0.02127660 0.04255319 0.04255319 0.02127660 0.04255319
## [19] 0.25531915
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
## 
## $counts
##  [1] 4 0 0 2 9 5 2 5 3 3 7 3 1 1 0 2
## 
## $density
##  [1] 0.08510638 0.00000000 0.00000000 0.04255319 0.19148936 0.10638298
##  [7] 0.04255319 0.10638298 0.06382979 0.06382979 0.14893617 0.06382979
## [13] 0.02127660 0.02127660 0.00000000 0.04255319
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## $counts
##  [1] 21  2  2  0  0  0  1  0  0  0  0  0  1  1  0  1  0  1 10  7
## 
## $density
##  [1] 0.44680851 0.04255319 0.04255319 0.00000000 0.00000000 0.00000000
##  [7] 0.02127660 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## [13] 0.02127660 0.02127660 0.00000000 0.02127660 0.00000000 0.02127660
## [19] 0.21276596 0.14893617
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5 19.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
## 
## $counts
##  [1] 13  2  2  0  0  2  3  5 10  4  0  1  1  1  2  1
## 
## $density
##  [1] 0.27659574 0.04255319 0.04255319 0.00000000 0.00000000 0.04255319
##  [7] 0.06382979 0.10638298 0.21276596 0.08510638 0.00000000 0.02127660
## [13] 0.02127660 0.02127660 0.04255319 0.02127660
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
## 
## $counts
##  [1] 10  1  6  1  2  6  1  2  1  0  1  2  2  2  1  1  4  1  3
## 
## $density
##  [1] 0.21276596 0.02127660 0.12765957 0.02127660 0.04255319 0.12765957
##  [7] 0.02127660 0.04255319 0.02127660 0.00000000 0.02127660 0.04255319
## [13] 0.04255319 0.04255319 0.02127660 0.02127660 0.08510638 0.02127660
## [19] 0.06382979
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
## 
## $counts
##  [1] 2 2 2 3 2 3 4 4 2 5 5 2 3 2 3 3
## 
## $density
##  [1] 0.04255319 0.04255319 0.04255319 0.06382979 0.04255319 0.06382979
##  [7] 0.08510638 0.08510638 0.04255319 0.10638298 0.10638298 0.04255319
## [13] 0.06382979 0.04255319 0.06382979 0.06382979
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
## 
## $counts
##  [1] 18  8  0  2  0  0  2  1  2  0  2  0  0  2  1  0  5  0  4
## 
## $density
##  [1] 0.38297872 0.17021277 0.00000000 0.04255319 0.00000000 0.00000000
##  [7] 0.04255319 0.02127660 0.04255319 0.00000000 0.04255319 0.00000000
## [13] 0.00000000 0.04255319 0.02127660 0.00000000 0.10638298 0.00000000
## [19] 0.08510638
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
## 
## $counts
##  [1] 1 1 4 5 5 4 5 6 3 8 1 2 1 1
## 
## $density
##  [1] 0.02127660 0.02127660 0.08510638 0.10638298 0.10638298 0.08510638
##  [7] 0.10638298 0.12765957 0.06382979 0.17021277 0.02127660 0.04255319
## [13] 0.02127660 0.02127660
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $counts
##  [1] 12  6  2  1  3  5  0  3  0  2  2 10  1
## 
## $density
##  [1] 0.25531915 0.12765957 0.04255319 0.02127660 0.06382979 0.10638298
##  [7] 0.00000000 0.06382979 0.00000000 0.04255319 0.04255319 0.21276596
## [13] 0.02127660
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $counts
##  [1]  5  5  4  1  2  1  1  6 10  5  4  2  1
## 
## $density
##  [1] 0.10638298 0.10638298 0.08510638 0.02127660 0.04255319 0.02127660
##  [7] 0.02127660 0.12765957 0.21276596 0.10638298 0.08510638 0.04255319
## [13] 0.02127660
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
#normal_targeted_het_counts

Combining individual snp-pileup dat files to form normal and tumor input files (with on and off target reads)

Necessary for large .vaf files with many samples

clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
clin_sample_paths <- paste("../Gaudin_thesis_data/standard/",clin_samples,".dat",sep="")
clin_sample_counts_merged <- merge_count_files(clin_sample_paths) ##merged SNP counts file

normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
normal_sample_paths <- paste("../Gaudin_thesis_data/nova_standard/",normal_samples,".dat",sep="")
normal_sample_counts_merged <- merge_count_files(normal_sample_paths) ##merged SNP counts file

saveRDS(normal_sample_counts_merged,"normal_table_merged.rds")
saveRDS(clin_sample_counts_merged,"clinical_table_merged.rds")

VAF Z-score Analysis for Tumor and Normal Samples

Load saved count files and sample names
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
tumor_countsmerged <- readRDS("../Gaudin_thesis_data/clinical_table_merged.rds") ##load saved counts_merged file
normal_countsmerged <- readRDS("../Gaudin_thesis_data/normal_table_merged.rds") ##load saved counts_merged file
normal_and_clinical_sample_names <- c(as.character(normal_samples),as.character(clin_samples)) ##combine into single list
Calculate Average absolute VAF Z-score for each tumor and normal sample
normal_tables <- format_het_table(normal_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
tumor_tables <- format_het_table(tumor_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
complete_vaf_df <- VAF_Z_Score_analysis(normal_tables, tumor_tables, 250, 25, 75, 5, 95, 7,normal_and_clinical_sample_names)
saveRDS(complete_vaf_df,"../Gaudin_thesis_data/cdf_250_25_75.rds")
complete_vaf_df <- readRDS("../Gaudin_thesis_data/cdf_250_25_75.rds")
#complete_vaf_df
gene_specific_z_scores <- gene_specific_tables(gene_table, tumor_tables, normal_tables, clin_samples, 25,75,5,95,7,250)
gene_z_df <- gene_specific_z_scores[[1]]
gene_counts_df <- gene_specific_z_scores[[2]]
#saveRDS(gene_z_df,"../Gaudin_thesis_data/clin_gene_z.rds")
#saveRDS(gene_counts_df,"../Gaudin_thesis_data/clin_gene_counts.rds")
sample_names <- read.table("../Gaudin_thesis_data/nova_clin_names.txt")$V1
sample_files <- read.table("../Gaudin_thesis_data/nova_clin_tlen_paths.txt")$V1

for(i in 1:length(sample_files)){
  frag_lengths <- read.table(as.character(sample_files[i]))$V1
  if(i == 1){
    nt_frags_binsize_5 <- bin_fragments(frag_lengths,5,1,600,as.character(sample_names[i]))
  }else{
    nt_frags_binsize_5 <- merge(nt_frags_binsize_5,bin_fragments(frag_lengths,5,1,600,sample_names[i]), by=c('start','end'))
  }
  print(i)
}

saveRDS(nt_frags_binsize_5,"../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends 
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
frag_table <- frag_table[,c(c(26:31),c(35:40),c(51:75))] #bins between 125 - 155, 170 - 200, 250 - 375 
frag_table_normal <- frag_table[c(1:47),]
frag_table_clin <- frag_table[c(48:nrow(frag_table)),]
frag_table_normal$sample_names = rownames(frag_table_normal)
frag_table_clin$sample_names = rownames(frag_table_clin)
frag_avg_z_scores <- calculate_frag_z_scores(frag_table_clin,frag_table_normal)
saveRDS(frag_avg_z_scores,"../Gaudin_thesis_data/clin_frag_z_scores_5.rds")
Load data and merge tables
clin_mut_percent_table <- readRDS("../Gaudin_thesis_data/clin_ctDNA_est_table.rds")
clin_sample_key_table <- readRDS("../Gaudin_thesis_data/key_reference.rds") ##data containing other sample identifiers 
complete_vaf_df$sample_names <- str_replace(complete_vaf_df$sample_names, "_DONOR", "-DONOR") ##adjust to fit sample names in other files
cnv_table <- read.table("../Gaudin_thesis_data/data_CNA.txt", header = TRUE) ##table showing called cnvs
####formatting for merging######
colnames(cnv_table) <- str_replace_all(colnames(cnv_table), "[.]", "-")
row.names(cnv_table) <- cnv_table$Hugo_Symbol
cnv_table <- cnv_table[,-c(1)]
cnv_table <- t(cnv_table)
cnv_table <- data.frame(cnv_table) ##convert from atomic vector
##egfr flagged mutaions from cbio
egfr_cnv <- data.frame(Tumor_Sample_Barcode = row.names(cnv_table), egfr_cn = cnv_table$EGFR)
all_data_table <- merge(clin_mut_percent_table, egfr_cnv, all = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0 
all_data_table  <- merge(all_data_table, clin_sample_key_table, by.x = "Tumor_Sample_Barcode", by.y = "id_1") ##id_1 = tumor sample barcode, id_2 == sample name
all_data_table <- merge(complete_vaf_df,all_data_table,by.x = "sample_names", by.y = "id_2", all.x = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0 
frag_avg_z_scores$sample_names <- str_replace(frag_avg_z_scores$sample_names, "_DONOR","-DONOR")
all_data_table <- clin_total_table <- merge(all_data_table,frag_avg_z_scores,by = "sample_names")
all_data_table <- all_data_table[-c(which(all_data_table$pool == "tumor" & is.na(all_data_table$Tumor_Sample_Barcode))),] ##remove tumor samples (4) which don't have egfr or barcode data
##add clinical patient IDs
patient_ids <- all_data_table$Tumor_Sample_Barcode 
patient_ids <-str_replace_all(patient_ids, "-T.*", "")
all_data_table$patient_ids <- patient_ids
##add egfr gene specific Z scores
egfr_z_scores <- data.frame(sample_names = gene_z_df$sample_name, egfr_z_score = gene_z_df$EGFR, egfr_het_SNP_count = gene_counts_df$EGFR) 
all_data_table <- merge(all_data_table, egfr_z_scores, by.x = "sample_names", all.x = TRUE)
#all_data_table
##mark samples with same patient id that show copy number flagged in some samples but not others (-1 means flagged for CNV in other patient samples)
updated_egfr_cn <- c()
for(i in 1:nrow(all_data_table)){
  if(is.na(all_data_table$egfr_cn[i])){
    updated_egfr_cn <- c(updated_egfr_cn,NA)
  }
  else{
    if(all_data_table$egfr_cn[i] == 2){
      #print("hi")
      updated_egfr_cn <- c(updated_egfr_cn, 2)
    }
    else{
      if(sum(all_data_table$egfr_cn[which(all_data_table$patient_ids == all_data_table$patient_ids[i])]) > 0){
        updated_egfr_cn <- c(updated_egfr_cn, -1)
      }
      else{
        updated_egfr_cn <- c(updated_egfr_cn, 0)
      }
    } 
  }
}
all_data_table$updated_egfr_cn <- updated_egfr_cn

Figure 3. Boxplot distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool

ggplot(all_data_table, aes(x = pool, y = VAF_Z_Score)) + geom_boxplot() + xlab("Pool") + ylab("Mean Absolute heterozygous SNP VAF Z-score")

#### Figure 4. Histograms showing distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool

ggplot(all_data_table, aes(x=VAF_Z_Score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) + theme(legend.position="top") + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(all_data_table, aes(x=VAF_Z_Score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) +
  theme(legend.position="top") + xlim(0.5,2) + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 72 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

###### Figure 5. Scatter plots showing distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool

p <- ggplot() 
p <- p + geom_point(data = all_data_table[clin_total_table$pool == "tumor",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[clin_total_table$pool == "normal",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean Fragment Length Z-score")
print(p)
## Warning: Removed 4 rows containing missing values (geom_point).

g <- p + xlim(0,0.2) + ylim(0.1,3)
print(g)
## Warning: Removed 102 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).

Figure 6. Receiver operator characteristic curves for predicting samples that came from clinical cancer patient samples or normal samples with VAF Z-score

print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Sensitivity"
## [1] 0.3326271
## [1] "AUC"
## [1] 0.6494771
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Sensitivity"
## [1] 0.4181287
## [1] "AUC"
## [1] 0.7085977
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Sensitivity"
## [1] 0.9626168
## [1] "AUC"
## [1] 0.9976138
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

Figure 8. Scatter plot distribution for tumor samples (n = 301) with 7 or more heterozygous SNPs within targeted EGFR gene

ggplot(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$egfr_het_SNP_count > 7),], aes(x=mean_AF, y= egfr_z_score, color = as.character(updated_egfr_cn))) + geom_point() + ylab("Mean Z-Score for EGFR Heterozygous SNPs") + xlab("Estimated ctDNA Fraction") + ggtitle("EGFR Mean SNP VAF Z-Score vs Predicted ctDNA Fraction and CNV") + scale_color_discrete(name = "Predicted CNV Amplification", labels = c("Amplified in other samples from same patient", "No amplification", "Amplification")) 

print("number of SNPs")
## [1] "number of SNPs"
print(length(which(all_data_table$egfr_het_SNP_count > 7)))
## [1] 401
print("number of SNPs from patients with no flagged EGFR cnv")
## [1] "number of SNPs from patients with no flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 0)))
## [1] 390
print("number of SNPs from samples with flagged EGFR cnv")
## [1] "number of SNPs from samples with flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 2)))
## [1] 9
print("number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples")
## [1] "number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == -1)))
## [1] 2

Figure 12. Histograms showing distributions of mean fragment length absolute Z-scores for normal pool

ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) + theme(legend.position="top") + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) +
  theme(legend.position="top") + xlim(0.1,3) + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 96 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

Figure 13. Scatter plots showing distributions of mean absolute fragment length Z-scores for normal pool

p <- ggplot() 
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "tumor",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "normal",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean Fragment Length Z-score")
print(p)

g <- p + xlim(0,0.2) + ylim(0.1,3)
print(g)
## Warning: Removed 100 rows containing missing values (geom_point).

Figure 14. ROC curves for classifying samples as clinical cancer patient samples or normal samples

print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Sensitivity"
## [1] 0.2605932
## [1] "AUC"
## [1] 0.8364587
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Sensitivity"
## [1] 0.3187135
## [1] "AUC"
## [1] 0.866368
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Sensitivity"
## [1] 0.5607477
## [1] "AUC"
## [1] 0.9407437
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

Figure 15. Scatter plot demonstrating the correlation between mean SNP VAF Z-scores, mean fragment length Z-scores, and estimated ctDNA fraction

print(ggplot(all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = mean_AF)) + geom_point() + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Estimated ctDNA Fraction"))

print("Correlation for VAF Z-score and Frag Z Score in all samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in all samples"
print(cor(all_data_table$VAF_Z_Score, all_data_table$abs_frag_z_score))
## [1] 0.5649104
print("Correlation for VAF Z-score and Frag Z Score in tumor samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in tumor samples"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score, all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 0.5564904

Figure 16. Scatter plot demonstrating the correlation between mean SNP VAF Z-scores, and mean fragment length Z-scores, and for both normal

f <- ggplot() + geom_point(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool)) + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Pool")
f <- f + geom_point(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool))
print(f)

Figure 17. ROC curves for classifying samples as clinical cancer patient samples or normal samples (n = 47) using both VAF Z-score and fragment length Z-score

print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "fragment_length_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Sensitivity"
## [1] 0.4258475
## [1] "AUC"
## [1] 0.6494771
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "fragment_length_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Sensitivity"
## [1] 0.5204678
## [1] "AUC"
## [1] 0.7085977
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "fragment_length_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Sensitivity"
## [1] 0.9626168
## [1] "AUC"
## [1] 0.9976138
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

Figure 10. Average fraction of total cfDNA for fragment lengths of normal pool samples (n = 47) and a). clinical, tumor pool samples (n = 472) b). clinical, tumor pool samples with estimated ctDNA fraction above 0 (n = 342) c). clinical, tumor pool samples with estimated ctDNA fraction above 0.05 (n = 107)

frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends 
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
frag_table$sample_names <- str_replace(frag_table$sample_names, "_DONOR", "-DONOR")
normal_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "normal"),]$sample_names)
tumor_all_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor"),]$sample_names)
tumor_above_0_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$sample_names)
tumor_above_05_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$sample_names)

normal_frag_table <- frag_table[normal_indices,-c(ncol(frag_table))]
clin_table_all <- frag_table[tumor_all_indices,-c(ncol(frag_table))]
normal_avgs <- colSums(normal_frag_table)/nrow(normal_frag_table)
clin_avgs_all <- colSums(clin_table_all)/nrow(clin_table_all)
all_avgs <- c(normal_avgs,clin_avgs_all)

clin_table_above_0 <- frag_table[tumor_above_0_indices,-c(ncol(frag_table))]
clin_avgs_above_0 <- colSums(clin_table_above_0)/nrow(clin_table_above_0)
clin_table_above_05 <- frag_table[tumor_above_05_indices,-c(ncol(frag_table))]
clin_avgs_above_05 <- colSums(clin_table_above_05)/nrow(clin_table_above_05)

all_avgs_above_0 <-  c(normal_avgs,clin_avgs_above_0)
all_avgs_above_05 <-  c(normal_avgs,clin_avgs_above_05)

repeated <- length(all_avgs)/2
normal_names <- rep("normal",repeated)
tumor_names <- rep("tumor",repeated)
all_names <- c(normal_names,tumor_names)
intervals <- c(seq(1,595,5),seq(1,595,5))
fragments_df_all <- data.frame(avg_coverage = all_avgs, pool = all_names, bin = intervals)
fragments_df_above_0 <- data.frame(avg_coverage = all_avgs_above_0, pool = all_names, bin = intervals)
fragments_df_above_05 <- data.frame(avg_coverage = all_avgs_above_05, pool = all_names, bin = intervals)

print(ggplot(fragments_df_all, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(100,400))
## Warning: Removed 118 rows containing missing values (geom_path).

print(ggplot(fragments_df_above_0, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(100,400))
## Warning: Removed 118 rows containing missing values (geom_path).

print(ggplot(fragments_df_above_05, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(100,400))
## Warning: Removed 118 rows containing missing values (geom_path).

Figure 11. TSNE and PCA plots showing separation between normal (red) and clinical samples (blue).

TSNE
tsne_table_sd_scaled <- frag_table[c(normal_indices,tumor_all_indices),-c(ncol(frag_table))]
for(i in 1:ncol(tsne_table_sd_scaled)){
min <- min(tsne_table_sd_scaled[,i])
max <- max(tsne_table_sd_scaled[,i])
max_m_min <- max - min
tsne_table_sd_scaled[,i] <- (tsne_table_sd_scaled[,i] - min)/max_m_min 
}
value_vector <- rep("normal",nrow(frag_table))
value_vector[tumor_all_indices] <- "ctDNA_est_0"
value_vector[tumor_above_0_indices] <- "ctDNA_above_0"
value_vector[tumor_above_05_indices] <- "ctDNA_above_05"
value_vector <- value_vector[c(normal_indices,tumor_all_indices)]
#make colors and labels vectors
color_vector <- value_vector
color_vector[which(color_vector == "normal")] <- "2" #red
color_vector[which(color_vector == "ctDNA_est_0")] <- "3" #green
color_vector[which(color_vector == "ctDNA_above_0")] <- "4" #blue
color_vector[which(color_vector == "ctDNA_above_05")] <-  "1" #black
color_vector <- as.numeric(color_vector)
label_vector <- color_vector
label_vector[] <- "*"
#make TSNE
train <- as.matrix(tsne_table_sd_scaled)
tsne <- Rtsne(train, dims = 2, perplexity=40, verbose=FALSE, max_iter = 3000)
plot(tsne$Y, t='n', main="tsne")
text(tsne$Y, labels = label_vector, col = color_vector)
##may need to adjust legend position (x, y are first two parameters)
legend(-20, -5, legend=c("normal", "ctDNA_est_0","ctDNA_above_0", "ctDNA_above_05" ),col=c("red", "green", "blue", "black"), lwd = 1, box.lty=0, box.lwd=0)

###### PCA

pca <- prcomp(t(tsne_table_sd_scaled), scale = FALSE)
pca <- as.data.frame(pca[2]$rotation)
pca$status <- value_vector
print(ggplot(pca) + geom_point(aes(PC1,PC2, fill = status, color = status)))

Support Vector Machine classifier

VAF_Z_table <- data.frame(sample_names = all_data_table$sample_names, pool = all_data_table$pool, mean_AF= all_data_table$mean_AF, VAF_Z_Score = all_data_table$VAF_Z_Score)
VAF_frag_table <- merge(VAF_Z_table, frag_table[,c(c(26:31),c(35:40),c(51:75),c(ncol(frag_table)))], by="sample_names") ##frags with greatest diff between normal and tumor
print("All normal and tumor samples")
## [1] "All normal and tumor samples"
calclulate_mean_SVM_100(VAF_frag_table)
## [1] "true_neg"
## [1] 22.41
## [1] "false_neg"
## [1] 11.09
## [1] "false_pos"
## [1] 24.59
## [1] "true_pos"
## [1] 460.91
print("All normal and tumor samples with ctDNA > 0")
## [1] "All normal and tumor samples with ctDNA > 0"
calclulate_mean_SVM_100(VAF_frag_table[which(VAF_frag_table$pool == "normal" | VAF_frag_table$mean_AF > 0),])
## [1] "true_neg"
## [1] 30.15
## [1] "false_neg"
## [1] 10.91
## [1] "false_pos"
## [1] 16.85
## [1] "true_pos"
## [1] 331.09
print("All normal and tumor samples with ctDNA > 0.05")
## [1] "All normal and tumor samples with ctDNA > 0.05"
calclulate_mean_SVM_100(VAF_frag_table[which(VAF_frag_table$pool == "normal" | VAF_frag_table$mean_AF > 0.05),])
## [1] "true_neg"
## [1] 42.56
## [1] "false_neg"
## [1] 4.87
## [1] "false_pos"
## [1] 4.44
## [1] "true_pos"
## [1] 102.13